###########################################################################
## Challenging Encounters and Within-Physician Practice Variability
## Script 1- Identify difficult cases (Events)
## Last update: 29-1-23
###########################################################################

## Setup
###########################################################################

# Packages
library(dplyr)     
library(readxl)   
library(haven)    
library(lubridate)

# Directories
# pd: project directory; the Events file is saved beneath it at the end of
#     this script.
# wd: working directory against which the relative input paths are resolved.
pd <- file.path(".", "project")
wd <- "."
setwd(wd)

# Functions
# Most frequent value of a vector.
#
# v: vector whose mode is wanted.
# Returns the value of `v` that occurs most often; when several values tie,
# the one that appears first in `v` is returned.
getmode <- function(v) {
  candidates <- unique(v)
  # Position of every element within `candidates`, counted per candidate
  occurrences <- tabulate(match(v, candidates))
  most_frequent <- which.max(occurrences)
  candidates[most_frequent]
}

###########################################################################

## Identify new cancer diagnoses
###########################################################################

# Import cancer diagnoses data
Diag <- readRDS(file.path("R_Mac", "R_files", "Diagnosis"))

# Rows of `data` whose ICD9 code lies strictly between `lower` and `upper`.
# which() discards rows with a missing ICD9; bare logical indexing would turn
# each of them into an all-NA row that is then copied into every subset below.
icd9_between <- function(data, lower, upper) {
  data[which(data$ICD9 > lower & data$ICD9 < upper), ]
}

# Restrict to the ICD-9 range handled by this script
Diag <- icd9_between(Diag, 149, 210)

# Most common cancer types (bounds are exclusive, so e.g. 173.999-176
# selects every code from 174 up to, but not including, 176)
Breast <- icd9_between(Diag, 173.999, 176)
Colon <- icd9_between(Diag, 152.999, 154)
Stomach <- icd9_between(Diag, 150.999, 152)
Rectum <- icd9_between(Diag, 153.999, 155)
Liver <- icd9_between(Diag, 154.999, 156)
Pancreas <- icd9_between(Diag, 156.999, 158)
Lung <- icd9_between(Diag, 161.999, 163)
Melanome <- icd9_between(Diag, 171.999, 173)
Ovary <- icd9_between(Diag, 178.999, 184)
Prostate <- icd9_between(Diag, 184.999, 186)
Bladder <- icd9_between(Diag, 187.999, 189)
Kidney <- icd9_between(Diag, 188.999, 190)
Thyroid <- icd9_between(Diag, 192.999, 194)
Lymphoma <- icd9_between(Diag, 201.999, 203)
Myeloma <- icd9_between(Diag, 202.999, 204)
Leukemia <- icd9_between(Diag, 203.999, 205)

# Clean
rm(Diag, icd9_between)
gc()

# Function - Keep diagnoses with no other diagnoses in the m months before
#
# objectname: data frame of diagnoses with `patient_id` and `date` columns.
# m:          look-back window in months; a month is counted as 30 days.
#
# Returns the rows of `objectname`, sorted by patient and date, whose
# immediately preceding diagnosis for the same patient is at least m*30 days
# earlier (or that have no preceding diagnosis). All input columns are
# returned unchanged, whatever their number, and a zero-row input is returned
# as is.
keep_first <- function(objectname, m) {
  objectname <- objectname[order(objectname$patient_id, objectname$date), ]
  n <- nrow(objectname)
  if (n == 0) {
    return(objectname)
  }

  # Index of the preceding row; NA for the first row, which has none
  prev <- c(NA_integer_, seq_len(n - 1L))
  same_patient <- objectname$patient_id == objectname$patient_id[prev]
  gap_days <- as.numeric(
    difftime(objectname$date, objectname$date[prev], units = "days")
  )

  # A comparison that is NA (first row, missing id or date) never drops a row
  too_close <- (same_patient & gap_days < m * 30) %in% TRUE
  objectname[!too_close, ]
}

# Apply the six-month look-back to every cancer type (alphabetical order)
Bladder  <- keep_first(Bladder, 6)
Breast   <- keep_first(Breast, 6)
Colon    <- keep_first(Colon, 6)
Kidney   <- keep_first(Kidney, 6)
Leukemia <- keep_first(Leukemia, 6)
Liver    <- keep_first(Liver, 6)
Lung     <- keep_first(Lung, 6)
Lymphoma <- keep_first(Lymphoma, 6)
Melanome <- keep_first(Melanome, 6)
Myeloma  <- keep_first(Myeloma, 6)
Ovary    <- keep_first(Ovary, 6)
Pancreas <- keep_first(Pancreas, 6)
Prostate <- keep_first(Prostate, 6)
Rectum   <- keep_first(Rectum, 6)
Stomach  <- keep_first(Stomach, 6)
Thyroid  <- keep_first(Thyroid, 6)

###########################################################################

## Validate against the Israeli Cancer Registry
###########################################################################

## Import Cancer Registry Data
# read_csv() belongs to readr, which this script never attaches; the call is
# namespace-qualified so it does not fail with "could not find function".
Cancer <- readr::read_csv("raw_data/Cancer.csv")
colnames(Cancer) <- c(
  "patient_id", "cancer_code", "date_from_cancer", "date_of_cancer", "status"
)
Cancer <- Cancer[, c("patient_id", "date_from_cancer", "status")]
Cancer$date_from_cancer <- as.Date(as.character(Cancer$date_from_cancer))
# Drop entries dated in the year 1800 (presumably a placeholder for an unknown
# date -- TODO confirm). which() also drops entries whose date is missing
# instead of turning them into all-NA rows.
Cancer <- Cancer[which(year(Cancer$date_from_cancer) != 1800), ]

# Function 2 -- Keep only diagnoses with a registry entry within 30 days
#
# objectname: data frame of diagnoses with `patient_id` and `date` columns.
# registry:   registry entries with `patient_id` and `date_from_cancer`
#             columns; defaults to the global `Cancer` table.
# window:     a registry entry matches when it lies strictly fewer than this
#             many days before or after the diagnosis.
#
# Returns one row per (patient_id, date) diagnosis that has a matching registry
# entry, joined to the closest such entry; `diff` holds the signed gap in days
# (diagnosis date minus registry date). Pairs whose gap cannot be computed
# because a date is missing are dropped.
validate_registry <- function(objectname, registry = Cancer, window = 30) {
  newname <- merge(objectname, registry, by = "patient_id")
  newname$diff <- difftime(newname$date, newname$date_from_cancer,
                           units = "days")

  gap <- abs(as.numeric(newname$diff))
  # which() discards NA gaps; bare logical indexing would yield all-NA rows
  in_window <- which(gap < window)

  # Closest registry entry first, then keep the first row per diagnosis
  newname <- newname[in_window[order(gap[in_window])], ]
  newname[!duplicated(newname[, c("patient_id", "date")]), ]
}
# Validate every cancer type against the registry (alphabetical by result)
Bla_Reg <- validate_registry(Bladder)
Bre_Reg <- validate_registry(Breast)
Col_Reg <- validate_registry(Colon)
Kid_Reg <- validate_registry(Kidney)
Leu_Reg <- validate_registry(Leukemia)
Liv_Reg <- validate_registry(Liver)
Lun_Reg <- validate_registry(Lung)
Lym_Reg <- validate_registry(Lymphoma)
Mel_Reg <- validate_registry(Melanome)
Mye_Reg <- validate_registry(Myeloma)
Ova_Reg <- validate_registry(Ovary)
Pan_Reg <- validate_registry(Pancreas)
Pro_Reg <- validate_registry(Prostate)
Rec_Reg <- validate_registry(Rectum)
Sto_Reg <- validate_registry(Stomach)
Thy_Reg <- validate_registry(Thyroid)
###########################################################################

## Events File
###########################################################################

# Mark cancer types
# The label is repeated to the table's row count explicitly: assigning a bare
# scalar with `$<-` errors on a zero-row data.frame, which would stop the
# script whenever a cancer type has no validated diagnosis.
Bre_Reg$Cancer <- rep("Breast", nrow(Bre_Reg))
Lun_Reg$Cancer <- rep("Lung", nrow(Lun_Reg))
Col_Reg$Cancer <- rep("Colon", nrow(Col_Reg))
Bla_Reg$Cancer <- rep("Bladder", nrow(Bla_Reg))
Mel_Reg$Cancer <- rep("Melanome", nrow(Mel_Reg))
Ova_Reg$Cancer <- rep("Ovary", nrow(Ova_Reg))
Pan_Reg$Cancer <- rep("Pancreas", nrow(Pan_Reg))
Pro_Reg$Cancer <- rep("Prostate", nrow(Pro_Reg))
Rec_Reg$Cancer <- rep("Rectum", nrow(Rec_Reg))
Sto_Reg$Cancer <- rep("Stomach", nrow(Sto_Reg))
Thy_Reg$Cancer <- rep("Thyroid", nrow(Thy_Reg))
Liv_Reg$Cancer <- rep("Liver", nrow(Liv_Reg))
Kid_Reg$Cancer <- rep("Kidney", nrow(Kid_Reg))
Mye_Reg$Cancer <- rep("Myeloma", nrow(Mye_Reg))
Leu_Reg$Cancer <- rep("Leukemia", nrow(Leu_Reg))
Lym_Reg$Cancer <- rep("Lymphoma", nrow(Lym_Reg))

# Append
Diag_Reg <- rbind(
  Bre_Reg, Lun_Reg, Col_Reg, Bla_Reg, Mel_Reg, Ova_Reg, Pan_Reg, Pro_Reg,
  Rec_Reg, Sto_Reg, Thy_Reg, Liv_Reg, Kid_Reg, Mye_Reg, Leu_Reg, Lym_Reg
)
# Keep only diagnoses that have a recorded doctor
Diag <- Diag_Reg[!is.na(Diag_Reg$doctor_id), ]

# Mark first event for patient
# Events are numbered chronologically within each patient. ungroup() stops the
# per-patient grouping from leaking into Diag and Events.
Diag <- Diag[order(Diag$patient_id, Diag$date), ]
Diag <- ungroup(mutate(group_by(Diag, patient_id), serial = row_number()))
Diag$First <- 0
Diag$First[Diag$serial == 1] <- 1

# Events data
# Columns are picked by position. Column 2 is assumed to be the diagnosis date
# and column 3 the diagnosing doctor -- TODO confirm against the Diagnosis
# file layout.
colnames(Diag)[2] <- "diag_date"
Events <- Diag[, c(1:4, 8:11, 13)]
colnames(Events)[3] <- "diag_doc_id"

# Import Event's patients' primary care visits after the diagnosis
# Row filters use which() so that a missing value in the condition drops the
# row instead of producing an all-NA row.
Visits <- readRDS(file.path("R_Mac", "R_files", "Visits"))
Visits <- Visits[which(Visits$specialization_code == 10), ]
Visits <- merge(Events, Visits, by = "patient_id")
# Days from diagnosis to visit. NOTE(review): Events appears to carry a `diff`
# column from the registry validation already; if so it is overwritten here.
Visits$diff <- difftime(Visits$date, Visits$diag_date, units = "days")
Visits <- Visits[which(Visits$diff > -1), ]

# Keep only the first visit after event
# Sort by visit date first and by `time` within a date, so the earliest visit
# is chosen even if `time` holds only a time of day -- TODO confirm what
# `time` contains.
Visits <- Visits[order(Visits$date, Visits$time), ]
Visits <- distinct(Visits, patient_id, diag_date, .keep_all = TRUE)

# Keep only Visits during a period of 30 days after the event
Visits <- Visits[which(Visits$diff < 31), ]

# Add patient details
Patients <- readRDS(file.path("R_Mac", "R_files", "Patients"))
Events_File <- merge(x = Visits, y = Patients, by = "patient_id")
# Year of the visit minus `birth_date`
# (assumes birth_date stores a birth year -- TODO confirm)
visit_year <- year(Events_File$date)
Events_File$age <- visit_year - Events_File$birth_date
rm(visit_year)

# Save events file
# Columns are selected by position; the 7th selected column becomes doctor_id
Events_File <- Events_File[, c(1, 2, 8:12, 24, 26)]
names(Events_File)[7] <- "doctor_id"
saveRDS(Events_File, file = file.path(pd, "data", "Events"))

###########################################################################
